Code
pip install requests beautifulsoup4 selenium pandas lxml

Tony Duan
Web scraping is the process of automatically extracting data from websites. Python is one of the most popular languages for web scraping due to its powerful libraries and simple syntax.
requests, BeautifulSoup, Selenium

Make sure you have these libraries installed:
| Library | Purpose | Best For |
|---|---|---|
| requests | HTTP requests | Simple static websites |
| BeautifulSoup | HTML parsing | Parsing and extracting data |
| Selenium | Browser automation | Dynamic/JavaScript websites |
| pandas | Data manipulation | Organizing scraped data |
The requests library is the foundation for most web scraping tasks.
import requests
from bs4 import BeautifulSoup
import pandas as pd
# Basic GET request
url = "http://books.toscrape.com/"
response = requests.get(url)
# Check if request was successful
if response.status_code == 200:
print("Successfully fetched the page!")
html_content = response.text
else:
print(f"Error: {response.status_code}")

Successfully fetched the page!
Always set proper headers to mimic a real browser:
BeautifulSoup makes it easy to navigate and extract data from HTML.
from bs4 import BeautifulSoup
# Create BeautifulSoup object
soup = BeautifulSoup(html_content, 'html.parser')
# Find book titles
book_titles = soup.find_all('h3')
# Find book prices
book_prices = soup.find_all('p', class_='price_color')
# Find all book containers
books = soup.find_all('article', class_='product_pod')
# Extract book information
for book in books[:3]: # Show first 3 books as example
title = book.h3.a['title']
price = book.find('p', class_='price_color').text
print(f"Title: {title}, Price: {price}")

Title: A Light in the Attic, Price: £51.77
Title: Tipping the Velvet, Price: £53.74
Title: Soumission, Price: £50.10
# CSS selectors for books
books = soup.select('.product_pod')
# Complex selectors for prices
prices = soup.select('p.price_color')
# Get parent/children elements - fix the iteration
for price in prices[:5]: # Just use 5 price as example
book_container = price.find_parent('article', class_='product_pod')
print(f"Found parent container: {book_container.h3.a['title']}")Found parent container: A Light in the Attic
Found parent container: Tipping the Velvet
Found parent container: Soumission
Found parent container: Sharp Objects
Found parent container: Sapiens: A Brief History of Humankind
import requests
from bs4 import BeautifulSoup
import pandas as pd
def scrape_books(url):
"""Scrape book information from a bookstore website"""
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
}
response = requests.get(url, headers=headers)
soup = BeautifulSoup(response.content, 'html.parser')
books_data = []
# Find all book containers
books = soup.find_all('article', class_='product_pod')
for book in books:
title = book.h3.a['title']
price = book.find('p', class_='price_color').text
stock = book.find('p', class_='instock availability').text.strip()
rating = book.p['class'][1] # Second class contains rating
books_data.append({
'title': title,
'price': price,
'stock': stock,
'rating': rating
})
return pd.DataFrame(books_data)
# Example usage
books_df = scrape_books('http://books.toscrape.com/')
print(books_df.head()) title price stock rating
0 A Light in the Attic £51.77 In stock Three
1 Tipping the Velvet £53.74 In stock One
2 Soumission £50.10 In stock One
3 Sharp Objects £47.82 In stock Four
4 Sapiens: A Brief History of Humankind £54.23 In stock Five
Always check a website’s robots.txt file:
Be respectful and don’t overwhelm servers:
import time
import random
def polite_requests(urls, delay_range=(1, 3)):
"""Make requests with random delays"""
for url in urls:
response = requests.get(url)
# Random delay between requests
time.sleep(random.uniform(*delay_range))
yield response
# Example usage:
# urls = [
# "http://books.toscrape.com/catalogue/page-1.html",
# "http://books.toscrape.com/catalogue/page-2.html",
# "http://books.toscrape.com/catalogue/page-3.html"
# ]
# for response in polite_requests(urls):
# print(f"Fetched: {response.url}")Always implement proper error handling:
def safe_request(url, max_retries=3):
"""Make HTTP requests with retry logic"""
for attempt in range(max_retries):
try:
response = requests.get(url, timeout=10)
response.raise_for_status() # Raise exception for bad status codes
return response
except requests.exceptions.RequestException as e:
print(f"Attempt {attempt + 1} failed: {e}")
if attempt < max_retries - 1:
time.sleep(2 ** attempt) # Exponential backoff
else:
raise
# Example usage:
# response = safe_request("http://books.toscrape.com/")
# print("Successfully fetched page!")def extract_books_from_page(html_content):
"""Extract book data from HTML content"""
soup = BeautifulSoup(html_content, "html.parser")
books = soup.find_all("article", class_="product_pod")
page_data = []
for book in books:
title = book.h3.a["title"]
price = book.find("p", class_="price_color").text
stock = book.find("p", class_="instock availability").text.strip()
rating = book.p["class"][1]
page_data.append(
{"title": title, "price": price, "stock": stock, "rating": rating}
)
return page_data
def scrape_multiple_pages(base_url, max_pages=5):
"""Scrape multiple pages of a website"""
all_data = []
for page in range(1, max_pages + 1):
url = f"{base_url}catalogue/page-{page}.html"
response = requests.get(url)
if response.status_code == 200:
# Extract data from current page
page_data = extract_books_from_page(response.content)
all_data.extend(page_data)
print(f"Scraped {len(page_data)} books from page {page}")
else:
print(f"Failed to fetch page {page}")
break
return all_data
# Example usage:
books_data = scrape_multiple_pages("http://books.toscrape.com/", 3)
print(f"Scraped {len(books_data)} books from 3 pages")Scraped 20 books from page 1
Scraped 20 books from page 2
Scraped 20 books from page 3
Scraped 60 books from 3 pages
def login_and_scrape(login_url, target_url, credentials):
"""Handle login before scraping"""
session = requests.Session()
# Login
login_data = {
'username': credentials['username'],
'password': credentials['password']
}
response = session.post(login_url, data=login_data)
# Scrape protected content
if response.status_code == 200:
protected_response = session.get(target_url)
return protected_response.content

---
title: "Web Scraping with Python"
author: "Tony Duan"
execute:
warning: false
error: false
format:
html:
toc: true
toc-location: right
code-fold: show
code-tools: true
code-block-bg: true
code-block-border-left: "#31BAE9"
code-copy: true
---
# Introduction to Web Scraping with Python
Web scraping is the process of automatically extracting data from websites. Python is one of the most popular languages for web scraping due to its powerful libraries and simple syntax.
## What You'll Learn
- Basic concepts of web scraping
- Using popular Python libraries: `requests`, `BeautifulSoup`, `Selenium`
- Best practices and ethical considerations
- Real-world examples
## Prerequisites
Make sure you have these libraries installed:
```{python}
#| eval: false
pip install requests beautifulsoup4 selenium pandas lxml
```
## Essential Libraries
| Library | Purpose | Best For |
|---------|---------|----------|
| `requests` | HTTP requests | Simple static websites |
| `BeautifulSoup` | HTML parsing | Parsing and extracting data |
| `Selenium` | Browser automation | Dynamic/JavaScript websites |
| `pandas` | Data manipulation | Organizing scraped data |
---
## Getting Started with Requests
The `requests` library is the foundation for most web scraping tasks.
```{python}
import requests
from bs4 import BeautifulSoup
import pandas as pd
# Basic GET request against the books.toscrape.com scraping sandbox
url = "http://books.toscrape.com/"
response = requests.get(url)
# Check if request was successful (HTTP 200 means OK)
if response.status_code == 200:
    print("Successfully fetched the page!")
    # Keep the raw HTML around; later chunks parse it with BeautifulSoup
    html_content = response.text
else:
    print(f"Error: {response.status_code}")
```
### Headers and User-Agent
Always set proper headers to mimic a real browser:
```{python}
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}
response = requests.get(url, headers=headers)
```
---
## Parsing HTML with BeautifulSoup
BeautifulSoup makes it easy to navigate and extract data from HTML.
```{python}
from bs4 import BeautifulSoup
# Create BeautifulSoup object
soup = BeautifulSoup(html_content, 'html.parser')
# Find book titles
book_titles = soup.find_all('h3')
# Find book prices
book_prices = soup.find_all('p', class_='price_color')
# Find all book containers
books = soup.find_all('article', class_='product_pod')
# Extract book information
for book in books[:3]: # Show first 3 books as example
title = book.h3.a['title']
price = book.find('p', class_='price_color').text
print(f"Title: {title}, Price: {price}")
```
### Advanced Selectors
```{python}
# CSS class selector: every element carrying class "product_pod"
books = soup.select('.product_pod')
# Tag-qualified selector: only <p> elements with class "price_color"
prices = soup.select('p.price_color')
# Walk upward from each price element to its enclosing book container
for price in prices[:5]:  # first 5 prices as an example
    book_container = price.find_parent('article', class_='product_pod')
    print(f"Found parent container: {book_container.h3.a['title']}")
```
---
## Real-World Example: Scraping Book Information
```{python}
import requests
from bs4 import BeautifulSoup
import pandas as pd
def scrape_books(url):
    """Scrape book information from a bookstore listing page.

    Parameters
    ----------
    url : str
        URL of the listing page (e.g. "http://books.toscrape.com/").

    Returns
    -------
    pandas.DataFrame
        One row per book with columns: title, price, stock, rating.

    Raises
    ------
    requests.exceptions.RequestException
        On network failure, timeout, or an HTTP error status.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
    }
    # timeout prevents an unresponsive server from hanging the scraper;
    # raise_for_status() stops us from silently parsing an error page.
    response = requests.get(url, headers=headers, timeout=10)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')
    books_data = []
    # Each book on the page is wrapped in <article class="product_pod">.
    for book in soup.find_all('article', class_='product_pod'):
        books_data.append({
            'title': book.h3.a['title'],
            'price': book.find('p', class_='price_color').text,
            'stock': book.find('p', class_='instock availability').text.strip(),
            # The star rating is encoded as the second CSS class of the
            # first <p>, e.g. <p class="star-rating Three"> -> "Three".
            'rating': book.p['class'][1],
        })
    return pd.DataFrame(books_data)
# Example usage
books_df = scrape_books('http://books.toscrape.com/')
print(books_df.head())
```
---
## Best Practices
### 1. Respect Robots.txt
Always check a website's `robots.txt` file:
```{python}
import requests
def check_robots_txt(domain):
    """Fetch and return the robots.txt contents for a domain.

    Parameters
    ----------
    domain : str
        Bare domain name, e.g. "books.toscrape.com".

    Returns
    -------
    str
        Raw response body of https://<domain>/robots.txt (note: may be
        an error-page body if the site serves no robots.txt).
    """
    robots_url = f"https://{domain}/robots.txt"
    # timeout keeps the check from hanging on an unreachable host
    response = requests.get(robots_url, timeout=10)
    return response.text
# Example: Check robots.txt for books.toscrape.com
# print(check_robots_txt("books.toscrape.com"))
```
### 2. Rate Limiting
Be respectful and don't overwhelm servers:
```{python}
import time
import random
def polite_requests(urls, delay_range=(1, 3)):
    """Yield a response for each URL, pausing between consecutive requests.

    Parameters
    ----------
    urls : iterable of str
        URLs to fetch in order.
    delay_range : tuple of (float, float)
        Min/max seconds to sleep between requests.

    Yields
    ------
    requests.Response
        The response for each URL, delivered as soon as it arrives.
    """
    for url in urls:
        response = requests.get(url, timeout=10)
        # Hand the response to the caller immediately; the courtesy
        # delay only needs to happen before the *next* request, not
        # before the consumer sees this one (the original slept first,
        # which also added a pointless delay after the final URL).
        yield response
        time.sleep(random.uniform(*delay_range))
# Example usage:
# urls = [
# "http://books.toscrape.com/catalogue/page-1.html",
# "http://books.toscrape.com/catalogue/page-2.html",
# "http://books.toscrape.com/catalogue/page-3.html"
# ]
# for response in polite_requests(urls):
# print(f"Fetched: {response.url}")
```
### 3. Error Handling
Always implement proper error handling:
```{python}
def safe_request(url, max_retries=3, timeout=10):
    """Make an HTTP GET request with retry logic and exponential backoff.

    Parameters
    ----------
    url : str
        URL to fetch.
    max_retries : int
        Maximum number of attempts before giving up.
    timeout : float
        Per-request timeout in seconds (previously hard-coded to 10;
        the default preserves the original behaviour).

    Returns
    -------
    requests.Response
        The first successful response.

    Raises
    ------
    requests.exceptions.RequestException
        Re-raised from the final failed attempt.
    """
    for attempt in range(max_retries):
        try:
            response = requests.get(url, timeout=timeout)
            response.raise_for_status()  # 4xx/5xx -> HTTPError
            return response
        except requests.exceptions.RequestException as e:
            print(f"Attempt {attempt + 1} failed: {e}")
            if attempt < max_retries - 1:
                time.sleep(2 ** attempt)  # backoff: 1s, 2s, 4s, ...
            else:
                raise
# Example usage:
# response = safe_request("http://books.toscrape.com/")
# print("Successfully fetched page!")
```
## Common Challenges and Solutions
### 1. Dealing with Pagination
```{python}
def extract_books_from_page(html_content):
    """Parse one catalogue page and return its books as a list of dicts.

    Parameters
    ----------
    html_content : str or bytes
        Raw HTML of a listing page.

    Returns
    -------
    list of dict
        One dict per book with keys: title, price, stock, rating.
    """
    soup = BeautifulSoup(html_content, "html.parser")
    results = []
    for article in soup.find_all("article", class_="product_pod"):
        # The star rating lives in the second CSS class of the first
        # <p>, e.g. class="star-rating Three" -> "Three".
        record = {
            "title": article.h3.a["title"],
            "price": article.find("p", class_="price_color").text,
            "stock": article.find("p", class_="instock availability").text.strip(),
            "rating": article.p["class"][1],
        }
        results.append(record)
    return results
def scrape_multiple_pages(base_url, max_pages=5, delay=0.0):
    """Scrape consecutive catalogue pages, stopping at the first failure.

    Parameters
    ----------
    base_url : str
        Site root ending in "/", e.g. "http://books.toscrape.com/".
    max_pages : int
        Upper bound on the number of pages to fetch.
    delay : float
        Optional pause in seconds between page requests (rate limiting,
        per the Best Practices section). Defaults to 0 so existing
        callers see identical behaviour.

    Returns
    -------
    list of dict
        Combined book records from every successfully fetched page.
    """
    all_data = []
    for page in range(1, max_pages + 1):
        url = f"{base_url}catalogue/page-{page}.html"
        response = requests.get(url, timeout=10)
        if response.status_code != 200:
            # A non-200 usually means we ran past the last page; stop.
            print(f"Failed to fetch page {page}")
            break
        page_data = extract_books_from_page(response.content)
        all_data.extend(page_data)
        print(f"Scraped {len(page_data)} books from page {page}")
        if delay:
            time.sleep(delay)
    return all_data
# Example usage:
books_data = scrape_multiple_pages("http://books.toscrape.com/", 3)
print(f"Scraped {len(books_data)} books from 3 pages")
```
### 2. Handling Forms and Authentication
```{python}
def login_and_scrape(login_url, target_url, credentials):
    """Log in via a POST form, then fetch a protected page in the same session.

    Parameters
    ----------
    login_url : str
        URL the login form posts to.
    target_url : str
        Protected page to fetch after authenticating.
    credentials : dict
        Must contain 'username' and 'password' keys.

    Returns
    -------
    bytes or None
        Raw body of the protected page, or None when the login POST
        does not return HTTP 200 (the original returned None here
        implicitly by falling off the end of the function).
    """
    # The Session carries auth cookies across requests; the context
    # manager guarantees its connection pool is closed (the original
    # never closed the session, leaking connections).
    with requests.Session() as session:
        login_data = {
            'username': credentials['username'],
            'password': credentials['password'],
        }
        response = session.post(login_url, data=login_data, timeout=10)
        if response.status_code != 200:
            return None
        protected_response = session.get(target_url, timeout=10)
        return protected_response.content
```